# --- Environment setup: Python plotting/stats stack plus an embedded R session ---
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import plotly
plotly.offline.init_notebook_mode()  # enable inline plotly rendering in the notebook
# rpy2 bridge: run R code from the notebook and exchange data frames.
import rpy2.robjects as ro
from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri
from rpy2.robjects.packages import importr  # NOTE(review): duplicate import (harmless)
from rpy2.robjects import r, pandas2ri  # NOTE(review): duplicate import of pandas2ri
base = importr('base')
# NOTE(review): this rebinds `stats` and shadows scipy.stats imported above;
# later code that needs scipy re-imports it locally.
stats = importr('stats')
graphics = importr('graphics')
utils = importr('utils')
ade4 = importr('ade4')
nlme = importr('nlme')
lme4 = importr('lme4')
lmertest = importr('lmerTest')
from rpy2.robjects.conversion import localconverter
import rpy2.ipython.html
rpy2.ipython.html.init_printing()  # pretty-print R objects as HTML in the notebook
def create_dict(keys, values):
    '''
    Pair two parallel sequences into a dict (keys[i] -> values[i]).
    Extra elements in the longer sequence are dropped (zip semantics).
    '''
    return dict(zip(keys, values))
def grouping_pop(df, level, start_rename):
    '''
    Build per-group summary tables: means, standard deviations and counts.

    df : input data frame
    level : column name(s) defining the grouping
    start_rename : index (in the grouped result) of the first column that
        receives a '_mean' / '_std' suffix
    Returns (df_mean, df_std, df_n).
    '''
    means = df.groupby(level).mean().reset_index().dropna(axis=1)
    # Both suffix maps are derived from the columns of the mean table.
    suffix_maps = [
        {col: col + suffix for col in means.columns[start_rename:]}
        for suffix in ('_mean', '_std')
    ]
    means = means.rename(columns=suffix_maps[0])
    # thresh=5: a column survives only with at least 5 non-NaN group values.
    stds = df.groupby(level).std().reset_index().dropna(axis=1, thresh=5)
    stds = stds.rename(columns=suffix_maps[1])
    counts = df.groupby(level).size().reset_index(name='counts')
    return means, stds, counts
def plot_heatmap(cor, mode, ann=False):
    '''
    Plot a lower-triangular heatmap of a correlation matrix, 2 modes available.

    cor : square correlation matrix (DataFrame)
    mode : 'abs' -> plot absolute correlations (colour scale [0, 1])
           'cor' -> plot raw correlations (colour scale [-1, 1])
    ann : annotate each cell with its value
    '''
    assert mode in ['cor', 'abs'], 'mode should be one of ["cor","abs"]'
    if mode == 'cor':
        vm = -1
    else:
        vm = 0
        cor = np.abs(cor)
    fig, ax = plt.subplots(figsize=(15, 15))
    # mask the upper triangle, keeping only the informative lower triangle
    # BUGFIX: np.bool was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin bool is the documented replacement.
    mask = np.triu(np.ones_like(cor, dtype=bool))
    # adjust mask and df: drop the fully-masked first row and last column
    mask = mask[1:, :-1]
    corr = cor.iloc[1:, :-1].copy()
    # plot heatmap
    sns.heatmap(corr, mask=mask, annot=ann, fmt=".2f", cmap='Blues',
                vmin=vm, vmax=1, cbar_kws={"shrink": .8})
    plt.yticks(rotation=0)
    plt.show()
def mean_confidence_interval(sm, n, confidence=0.95, verbose = False):
    '''
    Compute the half-width of the confidence interval of a mean.

    sm : per-group standard deviations (array-like)
    n : per-group sample sizes (array-like)
    confidence : confidence level (default 0.95)
    verbose : if True, print the intermediate t / se / ci values
    Returns the delta to add to / subtract from the mean (np.ndarray).
    '''
    # local import: the module-level name `stats` is rebound to the R package
    from scipy import stats
    n = np.asarray(n, dtype=float)
    sm = np.asarray(sm, dtype=float)
    # Student t quantile, vectorized over per-group degrees of freedom
    # (replaces the original per-element helper + list comprehension).
    tval = stats.t.ppf((1 + confidence) / 2., n - 1)
    h = sm / np.sqrt(n) * tval
    if verbose:
        print('t values : {}'.format(np.around(tval, decimals=2)))
        print('\nse values : {}'.format(np.around(sm, decimals=2)))
        print('\nci values : {}'.format(np.around(h, decimals=2)))
    return h
class MyPCA():
    '''
    Small PCA wrapper around sklearn: standardize the data, fit a full PCA,
    and plot scree / individual / variable (correlation-circle) views.
    Reference: file:///home/xavier/Downloads/fr_Tanagra_ACP_Python.pdf
    '''
    # NOTE(review): imports inside the class body become class attributes
    # (MyPCA.np, MyPCA.pd, ...); the methods actually resolve these names via
    # the module-level imports at the top of the file.
    from sklearn.preprocessing import StandardScaler
    from sklearn.decomposition import PCA
    import numpy as np
    import pandas as pd
    def __init__(self, df):
        # df: observations (rows) x variables (columns)
        self.df = df
        self.n = df.shape[0]  # number of observations
        self.p = df.shape[1]  # number of variables
    def standardize(self):
        # Center and scale every column to zero mean / unit variance -> self.Z
        sc = StandardScaler()
        self.Z = sc.fit_transform(self.df)
    def dopca(self):
        # Fit a full-rank PCA on the standardized data.
        try:
            self.Z
        # NOTE(review): bare except only warns; if Z is missing, the
        # fit_transform call below still raises AttributeError.
        except:
            print("Z is not defined please standardize before")
        self.acp = PCA(svd_solver='full')
        self.coord = self.acp.fit_transform(self.Z)  # individual factor scores
        self.eigval = self.acp.explained_variance_  # eigenvalue of each axis
    def assess_pca(self):
        # Scree plot + cumulative explained-variance plot.
        fig, (ax1, ax2) = plt.subplots(1,2, figsize=(8,6))
        ax1.bar(np.arange(1,self.p +1),self.acp.explained_variance_)
        ax1.set_title("Scree plot")
        ax1.set_ylabel("Eigen values")
        ax1.set_xlabel("Factor number")
        ax2.bar(np.arange(1,self.p+1),np.cumsum(self.acp.explained_variance_ratio_))
        ax2.set_title("Explained variance vs. # of factors")
        ax2.set_ylabel("Cumsum explained variance ratio")
        ax2.set_xlabel("Factor number")
        plt.show()
    def plot_indiv(self, label):
        # Scatter the observations on the first two factorial axes.
        fig, axes = plt.subplots(figsize=(12,12))
        axes.set_xlim(-6,6)  # same limits on the x axis
        axes.set_ylim(-6,6)  # and on the y axis
        # place the observation labels
        assert self.n == label.shape[0], 'rows number should have the same length than label'
        for i in range(self.n):
            plt.annotate(label[i],(self.coord[i,0],self.coord[i,1]))
        # add the axes
        plt.plot([-6,6],[0,0],color='silver',linestyle='-',linewidth=1)
        plt.plot([0,0],[-6,6],color='silver',linestyle='-',linewidth=1)  # display
        plt.show()
    def _compute_corvar(self):
        # Correlations between the original variables and the factorial axes.
        print('computing factor variable correlation')
        sqrt_eigval = np.sqrt(self.eigval)
        # correlation of the variables with the axes
        self.corvar = np.zeros((self.p,self.p))
        for k in range(self.p):
            self.corvar[:,k] = self.acp.components_[k,:] * sqrt_eigval[k]
        # the variables x factors correlation matrix is now in self.corvar
    def plot_features(self, label):
        # Correlation circle of the variables on the first two axes.
        self._compute_corvar()
        fig, axes = plt.subplots(figsize=(8,8))
        axes.set_xlim(-1,1)
        axes.set_ylim(-1,1)
        # display the labels (variable names)
        assert self.p == label.shape[0], 'cols number should have the same length than label'
        for j in range(self.p):
            plt.annotate(label[j],(self.corvar[j,0],self.corvar[j,1]))
        # add the axes
        plt.plot([-1,1],[0,0],color='silver',linestyle='-',linewidth=1)
        plt.plot([0,0],[-1,1],color='silver',linestyle='-',linewidth=1)
        # add the unit circle
        cercle = plt.Circle((0,0),1,color='blue',fill=False)
        axes.add_artist(cercle)
        # display
        plt.show()
    def compute_cos2(self):
        # Squared correlations (cos^2): variable contributions to axes 1 and 2.
        try:
            self.corvar
        # NOTE(review): bare except only warns; the next line still fails if
        # corvar is missing (plot_features must be called first).
        except:
            print("corvar is not defined, use plot_features before")
        self.cos2var = self.corvar**2
        print('Axis 1\n---------------------------------------------\n')
        print(pd.DataFrame({'id':self.df.columns,'COS2_1':self.cos2var[:,0],'COS2_2':self.cos2var[:,1]}).sort_values('COS2_1', ascending = False))
        print('Axis 2\n---------------------------------------------\n')
        print(pd.DataFrame({'id':self.df.columns,'COS2_1':self.cos2var[:,0],'COS2_2':self.cos2var[:,1]}).sort_values('COS2_2', ascending = False))
Temperature

Aridity Index (lower = more arid)

In black: Pinus pinaster populations.
Trabucco, A., and Zomer, R.J. 2018. Global Aridity Index and Potential Evapo-Transpiration (ET0) Climate Database v2. CGIAR Consortium for Spatial Information (CGIAR-CSI). Published online, available from the CGIAR-CSI GeoPortal at https://cgiarcsi.community
# import df
# Individual-level trait + climate data (semicolon-separated table).
df = pd.read_table("/home/xavier/Documents/research/FORMANRISK/data/data_formanrisk/individual_join.csv", sep = ";")
# remove few columns (coordinates duplicates, ids, and normalized trait copies)
df = df.drop(columns = ["X","Y",'X_TYPE_', 'X_FREQ_', 'individual', 'branch_diam', 'branch_diamn','num',
                        'P50n','P12n','P88n','slopen','Kmaxn'])
df = df.drop(columns=['email', 'info', 'x.1', 'y.1'])
print('dimensions of df are \nnrows : {0}\nncols : {1}'.format(df.shape[0], df.shape[1]))
# remove the _15 suffix from the bioclim variable names
df.columns = [re.sub("_15", "", c) for c in df.columns]
# treat the aridity index as a 20th bioclim variable
df = df.rename(columns={'AI':'bio20'})
# extracting index of bioclim var
# BUGFIX: raw string avoids the invalid escape sequence '\d'
# (DeprecationWarning, and a SyntaxWarning from Python 3.12 on).
bio_index = [i for i, item in enumerate(df.columns) if re.search(r'bio\d{1,2}', item)]
# renaming bioclim var with meaningful names (bio1..bio19 = WorldClim, bio20 = aridity)
keys = ["bio1" ,"bio2" ,"bio3" ,"bio4" ,"bio5" ,"bio6" ,"bio7" ,"bio8" ,"bio9" ,"bio10" ,"bio11" ,"bio12" ,"bio13" ,"bio14" ,"bio15" ,"bio16" ,"bio17" ,"bio18" ,"bio19", 'bio20']
values = ["Tmean_annual" ,"Mean_D_range" ,"Isothermality" ,"T_seasonality" ,"Tmax_warmerM" ,"Tmin_coldestM" ,"T_annual_range" ,"Tmean_wettestQ" ,"Tmean_driestQ" ,"Tmean_warmerQ" ,"Tmean_coldestQ" ,"P_annual" ,"P_wettestM" ,"P_driestM" ,"P_seasonality" ,"P_wettestQ" ,"P_driestQ" ,"P_warmestQ" ,"P_coldestQ", "Aridity_Index"]
dictionary = create_dict(keys,values)
df = df.rename(columns = dictionary)
# Subset the three Oin common-garden provenances, keeping only P50/site/Treatment.
df_oin = df[(df.site == 'oin_fr') | (df.site == 'oin_P') | (df.site == 'oin_es')].reset_index()
df_oin = df_oin[['P50', 'site', 'Treatment']]
# keep only pop from oin_es (disabled alternative)
if False:
    df = df[(df.site != 'oin_fr') & (df.site != 'oin_P')].reset_index()
    print(df.site.unique())
# Convert all pop from oin to oin_es whatever the origin (active branch)
if True:
    df.loc[(df.site == 'oin_fr') | (df.site == 'oin_P'),'site'] = 'oin_es'
    print(df.site.unique())
# remove all pop from oin (disabled alternative)
if False:
    df = df[(df.site != 'oin_fr') & (df.site != 'oin_P') & (df.site != 'oin_es')].reset_index()
    print(df.site.unique())
# creating summary tables with mean, std and n values per group defined by level
df_pop_mean,df_pop_std,df_pop_n = grouping_pop(df=df, level=['Species','site'], start_rename=2)
# extracting labels of columns of interest (numeric summary columns)
label_num = df_pop_mean.iloc[:,2::].columns
# concat mean, std and n summary tables
df_pop_mean = pd.concat([df_pop_mean,df_pop_std,df_pop_n], axis = 1)
# remove duplicated columns (group keys repeated by the concat)
df_pop_mean =df_pop_mean.loc[:,~df_pop_mean.columns.duplicated()]
# same summaries, additionally split by Treatment
df_pop_mean_T,df_pop_std_T ,df_pop_n_T = grouping_pop(df=df, level=['Species','site','Treatment'], start_rename=3)
# concat mean, std and n summary tables
df_pop_mean_T = pd.concat([df_pop_mean_T ,df_pop_std_T ,df_pop_n_T ], axis = 1)
# remove duplicated columns
df_pop_mean_T =df_pop_mean_T.loc[:,~df_pop_mean_T.columns.duplicated()]
# Oin-only P50 distribution per provenance and treatment.
fig = px.box(df_oin, x="site", y="P50", color = 'Treatment')
fig.show()
# Import the linear regression model classes
from pymer4.models import Lmer, Lm
# Fixed-effects-only model: P50 ~ Treatment
model = Lm("P50 ~ Treatment ", data=df_oin)
# Fit it
print(model.fit())
# Import the linear regression model class
from pymer4.models import Lmer
# Mixed model: Treatment fixed effect + random site intercept
model = Lmer("P50 ~ Treatment + (1|site)", data=df_oin)
# Fit it
print(model.fit())
import statsmodels.api as sm
from statsmodels.formula.api import ols
# Same question with statsmodels: sum-coded factors, type II ANOVA.
oin_lm = ols('P50 ~ C(Treatment, Sum)+C(site, Sum)',
             data=df_oin).fit()
table = sm.stats.anova_lm(oin_lm, typ=2) # Type 2 ANOVA DataFrame
print(table)
# import python data frame to R global environment
with localconverter(ro.default_converter + pandas2ri.converter):
    df_oin_r = ro.conversion.py2rpy(df_oin[['P50','site', 'Treatment']])
base.dim(df_oin_r)
%load_ext rpy2.ipython
%%R -i df_oin_r
# NOTE(review): 'basics' is not a known CRAN package -- possibly meant 'base'; verify.
require(basics)
%%R
# Cross-check the Python fits with R's lm.
lm1 = lm(P50 ~ site + Treatment, data = df_oin_r)
summary(lm1)
%%R
anova(lm1)
The Python and R statistical packages lead to the same F and p values.
# keeping only P. pinaster populations (individual and summary tables)
df_pp = df[df.Species == "pinus pinaster"]
df_mean_pp = df_pop_mean[df_pop_mean.Species == "pinus pinaster"]
df_mean_pp_T = df_pop_mean_T[df_pop_mean_T.Species == "pinus pinaster"]
# Sample size per site x treatment for P. pinaster.
fig = px.bar(df_mean_pp_T, x='site', y='counts',
             hover_data=['Treatment','counts'], color='Treatment',
             labels={'counts':'Nb indivs per site per Treatment (P. pinaster)'},
             height=400, barmode='group')
fig.show()
The data are cleaned; let's now move on to some data exploration.
# Distribution of individual P50 values, split by Treatment.
fig = px.histogram(df_pp, x="P50", color = 'Treatment', marginal="rug",
                   hover_data=df_pp.columns)
fig.show()
print("Mean P50 value is {0:.3f} with sd {1:.3f}".format(df_pp.P50.mean(),df_pp.P50.std()))
# Per-treatment means; groupby sorts Treatment alphabetically ('adult' first)
# -- the hard-coded label order relies on that; verify if levels change.
print('\n'.join('{}: {:.3f}'.format(['P50 adult', 'P50 young'][k],j) for k,j in enumerate(df_pp.groupby('Treatment').P50.mean().values)))
fig = px.box(df_pp, x="site", y="P50", color = 'Treatment')
fig.show()
# 95% CI half-widths of the per-site adult means.
mci = mean_confidence_interval(sm = df_mean_pp_T.loc[df_mean_pp_T['Treatment']=='adult', "P50_std"],
                               n = df_mean_pp_T.loc[df_mean_pp_T['Treatment']=='adult', "counts"],
                               verbose = False)
import matplotlib.pyplot as plt  # NOTE(review): duplicate import (already at top of file)
# Adult vs young per-site mean P50, with CI bars, OLS fit and identity line.
plt.figure(figsize = (10,10))
X = df_mean_pp_T.loc[df_mean_pp_T['Treatment']=='young', "P50_mean"]
Y = df_mean_pp_T.loc[df_mean_pp_T['Treatment']=='adult', "P50_mean"]
plt.errorbar(X, Y, yerr=mci, fmt='o', label = 'mean P50 + CI')
plt.xlim([-4.15,-3.55])
plt.ylim([-4.15,-3.55])
m, b = np.polyfit(X, Y, 1)  # OLS slope/intercept of adult ~ young
plt.plot(X, m*X + b, label = 'fitted line')
plt.plot( [-5,1],[-5,1], c = 'green', lw=2, label = 'identity line')
plt.legend()
plt.show()
# Same comparison as an interactive plotly figure with an OLS trendline.
fig = px.scatter(x=df_mean_pp_T.loc[df_mean_pp_T['Treatment']=='young', "P50_mean"],
                 y=df_mean_pp_T.loc[df_mean_pp_T['Treatment']=='adult', "P50_mean"],
                 trendline="ols",
                 error_y=mci,
                 error_y_minus=mci,
                 labels={
                     "x": "P50 mean young",
                     "y": "P50 Mean adult"
                 },
                 title="P50 of adults vs P50 pof youngs per population with 95% confidence interval of the mean",
                 range_x=[-4.15,-3.55],
                 range_y=[-4.15,-3.55],
                 width = 700,
                 height = 700)
fig.show()
results = px.get_trendline_results(fig)
print('Statistics summary\n-------------------------\n')
results.px_fit_results.iloc[0].summary()
There is a significant correlation between young and adult P50 (R² = 0.54, F=15.24, p = 0.0018) The estimated value of the slope is 0.69 (t = 3.9, p = 0.002)
# Per-site ratio of young mean P50 over adult mean P50.
ratio = df_mean_pp_T.loc[df_mean_pp_T['Treatment']=='young', "P50_mean"].values/df_mean_pp_T.loc[df_mean_pp_T['Treatment']=='adult', "P50_mean"].values
# assumes rows come in adult/young pairs per site, in matching order -- TODO confirm
df_mean_pp_T['P50_ratio']=np.repeat(ratio,2)
df_mean_pp_T['P50_ratio'].plot(kind='density')
plt.scatter(df_mean_pp_T['Aridity_Index_mean'], df_mean_pp_T['P50_ratio'])
plt.ylim([0.8,1.2])
plt.xlim([0,1])
# correlation heatmap of the mean values per population
df_corr= df_mean_pp[label_num].corr()
plot_heatmap(cor=df_corr, mode='abs', ann=True)
Strongest correlation between P50 and:
import plotly.express as px  # NOTE(review): duplicate import (already at top of file)
# Individual P50 vs mean annual temperature, one OLS fit per Treatment.
fig = px.scatter(df_pp, x="Tmean_annual", y="P50", color="Treatment", trendline="ols", title = 'P50 vs Tmean')
fig.show()
results = px.get_trendline_results(fig)
print(results)
# results.query("Treatment == 'adult'").px_fit_results.iloc[0].summary()
# results.query("Treatment == 'young'").px_fit_results.iloc[0].summary()
import plotly.express as px  # NOTE(review): duplicate import
# Same plot against the aridity index.
fig = px.scatter(df_pp, x="Aridity_Index", y="P50", color="Treatment", trendline="ols", title = 'P50 vs Aridity_index')
fig.show()
results = px.get_trendline_results(fig)
print(results)
Very weak R² between individuals measure of P50 and T mean annual of the pop (R² = 0.02 and 0.06)
Without the Treatment effect
# Long-format table: selected per-site climate means vs P50 (no Treatment split).
df_mean_pp_wide = df_mean_pp[["Tmean_annual_mean",
                              'Tmean_coldestQ_mean',
                              'T_seasonality_mean',
                              'T_annual_range_mean',
                              'Tmin_coldestM_mean',
                              "P50_mean",
                              "Aridity_Index_mean",
                              'site']]
df_mean_pp_wide = pd.melt(df_mean_pp_wide,
                          id_vars=[
                              'site',
                              "P50_mean"],
                          value_vars=["Tmean_annual_mean",
                                      'Tmean_coldestQ_mean',
                                      'T_seasonality_mean',
                                      'T_annual_range_mean',
                                      'Tmin_coldestM_mean',
                                      "Aridity_Index_mean"
                                      ])
df_mean_pp_wide.columns
# One facet per climate variable, each with its own OLS fit.
fig = px.scatter(df_mean_pp_wide,
                 x="value",
                 y="P50_mean",
                 trendline="ols",
                 text = "site",
                 facet_col="variable",
                 facet_col_wrap=3,
                 facet_row_spacing=0.04, # default is 0.07 when facet_col_wrap is used
                 facet_col_spacing=0.04, # default is 0.03
                 height=800, width=1000)
fig.update_traces(textposition='top center')
fig.update_xaxes(matches=None)  # free x axis per facet (variables have different units)
fig.show()
NB: Oin (es, fr & P) is a common garden located in Coruña (Spain) with 3 provenances: Spanish, Portuguese (Leiria) and French.
With the Treatment effect
# Same long-format table, this time keeping the Treatment split.
df_mean_pp_T_wide = df_mean_pp_T[["Tmean_annual_mean",
                                  'Tmean_coldestQ_mean',
                                  'T_seasonality_mean',
                                  'T_annual_range_mean',
                                  'Tmin_coldestM_mean',
                                  "P50_mean",
                                  "Treatment",
                                  "Aridity_Index_mean",
                                  'site']]
df_mean_pp_T_wide = pd.melt(df_mean_pp_T_wide,
                            id_vars=["Treatment",
                                     'site',
                                     "P50_mean"],
                            value_vars=["Tmean_annual_mean",
                                        'Tmean_coldestQ_mean',
                                        'T_seasonality_mean',
                                        'T_annual_range_mean',
                                        'Tmin_coldestM_mean',
                                        "Aridity_Index_mean"
                                        ])
df_mean_pp_T_wide.columns
# One facet per climate variable, OLS fit per Treatment within each facet.
fig = px.scatter(df_mean_pp_T_wide,
                 x="value",
                 y="P50_mean",
                 color="Treatment",
                 trendline="ols",
                 text = "site",
                 facet_col="variable",
                 facet_col_wrap=3,
                 facet_row_spacing=0.04, # default is 0.07 when facet_col_wrap is used
                 facet_col_spacing=0.04, # default is 0.03
                 height=800, width=1000)
fig.update_traces(textposition='top center')
fig.update_xaxes(matches=None)  # free x axis per facet
fig.show()
import plotly.express as px  # NOTE(review): duplicate import
# Focus plot: per-site mean P50 vs mean aridity, per Treatment.
fig = px.scatter(df_mean_pp_T, x="Aridity_Index_mean", y="P50_mean", color="Treatment", trendline="ols",
                 text = "site", title = 'P50 vs Aridity_index')
fig.update_traces(textposition='top center')
fig.show()
Results:
Some traits seem to be more correlated to P50, as seen on the heatmap, with perhaps some slight differences between treatments.
# PCA on the per-site x treatment climate means (all renamed bioclim variables).
mypca = MyPCA(df_mean_pp_T[[v +'_mean' for v in values]] )
mypca.standardize()
mypca.dopca()
mypca.assess_pca()
mypca.plot_indiv(label = df_mean_pp_T.site)
mypca.plot_features(label = df_mean_pp_T[[v +'_mean' for v in values]].columns)
mypca.compute_cos2()
# Append the PCA coordinates (acp_0, acp_1, ...) to the summary table.
acp_coord = pd.DataFrame(mypca.coord, columns = ['acp_'+str(i) for i in np.arange(0,mypca.coord.shape[1])])
df_mean_pp_T_acp = pd.concat([df_mean_pp_T, acp_coord], axis = 1)
# Mean P50 against the first and second PCA axes, per Treatment.
fig = px.scatter(df_mean_pp_T_acp, x="acp_0", y="P50_mean", color="Treatment", trendline="ols",
                 text = "site")
fig.update_traces(textposition='top center')
fig.show()
fig = px.scatter(df_mean_pp_T_acp, x="acp_1", y="P50_mean", color="Treatment", trendline="ols",
                 text = "site")
fig.update_traces(textposition='top center')
fig.show()
It seems that the correlation is better between P50 and the second axis of the PCA (R² = 0.23, 0.26), but it is not very strong; maybe fit a non-linear model.
first axis is associated with :
second axis is associated with :
# PCA on individual-level bioclim variables (same pipeline as the means PCA).
mypca = MyPCA(df_pp.iloc[:,bio_index])
mypca.standardize()
mypca.dopca()
mypca.assess_pca()
mypca.plot_indiv(label = df_pp.site)
# Append the PCA coordinates to the individual-level table.
acp_coord = pd.DataFrame(mypca.coord, columns = ['acp_'+str(i) for i in np.arange(0,mypca.coord.shape[1])])
df_pp_acp = pd.concat([df_pp, acp_coord], axis = 1)
def create_group(df, group, col = 'site', Name = 'Groupclim'):
    '''
    Label rows of df with a group id based on site-membership lists.

    df : data frame to annotate (modified in place and also returned)
    group : list of lists; group[k] holds the `col` values belonging to group k+1
    col : column matched against the membership lists
    Name : name of the new label column
    Returns df with a `Name` column containing 'group_1', 'group_2', ...
    '''
    # enumerate replaces the original manual `gr` counter (same labels)
    for gr, members in enumerate(group, start=1):
        for member in members:
            df.loc[df[col] == member, Name] = 'group_' + str(gr)
    return df
# Reference site lists (ll = two ACP clusters merged; pp = every P. pinaster site).
ll=['leiria', 'oin_es', 'san vicente']+['cerbere','ceret', 'perpignan', 'spain dune']
pp= ['biscarrosse', 'cerbere', 'ceret', 'hourtin', 'la teste', 'leiria',
     'lit et mixe', 'mimizan', 'oin_es' ,'perpignan', 'ribeira' ,'san vicente','spain dune' ,'branas' ,'orzaduero', 'llanos']
# Group based on ACP clusters
group = [['leiria', 'oin_es', 'san vicente'],
         ['cerbere','ceret', 'perpignan', 'spain dune'],
         ['biscarrosse', 'branas', 'hourtin', 'la teste', 'lit et mixe',
          'llanos', 'mimizan', 'orzaduero', 'ribeira']]
# NOTE(review): identical re-assignment of `group` -- redundant leftover.
group = [['leiria', 'oin_es', 'san vicente'],
         ['cerbere','ceret', 'perpignan', 'spain dune'],
         ['biscarrosse', 'branas', 'hourtin', 'la teste', 'lit et mixe',
          'llanos', 'mimizan', 'orzaduero', 'ribeira']]
# Tag both the summary and individual tables with the climate group labels.
df_mean_pp_T_acp= create_group(df= df_mean_pp_T_acp,
                               group = group,
                               col = 'site')
df_pp_acp = create_group(df= df_pp_acp,
                         group = group,
                         col = 'site')
# Export the annotated tables (toggle).
if True:
    df_mean_pp_T_acp.to_csv("/home/xavier/Documents/research/FORMANRISK/analyse/forman_cavit/output/table/df_mean_PP.csv")
    df_pp_acp.to_csv("/home/xavier/Documents/research/FORMANRISK/analyse/forman_cavit/output/table/df_PP.csv")
df_pp_acp.columns
# Import the linear regression model class
from pymer4.models import Lm
# Fixed-effects model with site as a plain factor.
model = Lm("P50 ~ Treatment + Aridity_Index + site", data=df_pp_acp)
# Fit it
print(model.fit())
import statsmodels.api as sm
from statsmodels.formula.api import ols
# statsmodels cross-check: Treatment + aridity, type II ANOVA.
oin_lm = ols('P50 ~ C(Treatment)+Aridity_Index',
             data=df_pp_acp).fit()
# print(oin_lm.summary())
table = sm.stats.anova_lm(oin_lm, typ=2) # Type 2 ANOVA DataFrame
print(table)
# Columns used by the mixed-model comparisons below.
df_restricted = df_pp_acp[['P50','site', 'Treatment', 'acp_0', 'acp_1', 'Groupclim', 'Aridity_Index']]
from pymer4.models import Lmer
# Null model: random site intercept only.
# this is the full model
model = Lmer("P50 ~ (1|site)", data=df_restricted)
# Fit it
print(model.fit())
from pymer4.models import Lmer
# Treatment + both PCA axes as fixed effects, random site intercept.
model = Lmer("P50 ~ Treatment + acp_0 + acp_1 + (1|site)", data=df_restricted)
# Fit it
print(model.fit())
print(model.anova())
from pymer4.models import Lmer
# Treatment x aridity interaction, random site intercept.
model = Lmer("P50 ~ Treatment * Aridity_Index + (1|site)", data=df_restricted)
# Fit it
print(model.fit())
print(model.anova())
# Visualize coefficients with group/cluster fits overlaid ("forest plot")
model.plot_summary()
model.plot("Aridity_Index", plot_ci=False, ylabel="predicted DV")
# No climatic variables: Treatment only, random site intercept.
model = Lmer("P50 ~ Treatment + (1|site)", data=df_restricted)
# Fit it
print(model.fit())
# Random Treatment slope per site in addition to the intercept.
model = Lmer("P50 ~ Treatment + acp_1 + (Treatment|site)", data=df_restricted)
# Fit it
print(model.fit())
# Same fixed effects with random intercept only (candidate best model).
model = Lmer("P50 ~ Treatment + acp_1 + (1|site)", data=df_restricted)
# Fit it
print(model.fit())
This is the best model based on the AIC value (the random-intercept and random-intercept-plus-slope models perform very similarly).
No effect of treatment effect on acp_1 which is the second axis of the pca correlated with :
# Visualize coefficients with group/cluster fits overlaid ("forest plot")
model.plot_summary()
model.plot("acp_1", plot_ci=False, ylabel="predicted DV")
# Climate-cluster factor instead of the continuous climate axes.
model = Lmer("P50 ~ Groupclim + (1|site)", data=df_restricted)
# Fit it
print(model.fit())
print(model.anova())
# NOTE(review): the rpy2 setup below duplicates the setup at the top of the file.
import rpy2.robjects as ro
from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri
from rpy2.robjects.packages import importr
from rpy2.robjects import r, pandas2ri
base = importr('base')
stats = importr('stats')
graphics = importr('graphics')
utils = importr('utils')
ade4 = importr('ade4')
nlme = importr('nlme')
lme4 = importr('lme4')
lmertest = importr('lmerTest')
from rpy2.robjects.conversion import localconverter
import rpy2.ipython.html
rpy2.ipython.html.init_printing()
# Push the restricted data frame into the R session.
with localconverter(ro.default_converter + pandas2ri.converter):
    df_restricted_r = ro.conversion.py2rpy(df_restricted)
base.dim(df_restricted_r)
from rpy2.robjects.packages import importr
utils = importr('utils')
%load_ext rpy2.ipython
%%R -i df_restricted_r
require(tidyverse)
require(dplyr)
require(lme4)
require(nlme)
require(lmerTest)
# glimpse(df_restricted_r)
%%R
# Exploratory plots of P50 against each candidate predictor.
par(mfrow=c(2,2))
# df_restricted_r = df_restricted_r[(df_restricted_r[,'site']!='oin_es' & df_restricted_r[,'site']!='san vicente'),]
plot(df_restricted_r[,'P50']~as.factor(df_restricted_r[,'Groupclim']))
plot(df_restricted_r[,'P50']~as.factor(df_restricted_r[,'Treatment']))
plot(df_restricted_r[,'P50']~as.factor(df_restricted_r[,'site']))
plot(df_restricted_r[,'P50']~df_restricted_r[,'Aridity_Index'])
%%R
levels(as.factor(df_restricted_r[,'site']))
%%R
# NOTE(review): `df = df` is a leftover no-op.
df = df
df_restricted_r[,'Treatment'] = as.factor(df_restricted_r[,'Treatment'] )
df_restricted_r[,'site'] = as.factor(df_restricted_r[,'site'] )
df_restricted_r[,'Groupclim'] = as.factor(df_restricted_r[,'Groupclim'] )
# Mixed model matching the pymer4 candidate best model.
mm1 = lmer(P50 ~ Treatment + acp_1 + (1|site), data = df_restricted_r)
summary(mm1)
%%R
df_restricted_r[,'Treatment'] = as.factor(df_restricted_r[,'Treatment'] )
df_restricted_r[,'site'] = as.factor(df_restricted_r[,'site'] )
df_restricted_r[,'Groupclim'] = as.factor(df_restricted_r[,'Groupclim'] )
# Variant with the climate-cluster factor.
mm1 = lmer(P50 ~ Treatment + Groupclim + (1|site), data = df_restricted_r)
summary(mm1)
%%R
df_restricted_r[,'Treatment'] = as.factor(df_restricted_r[,'Treatment'] )
df_restricted_r[,'site'] = as.factor(df_restricted_r[,'site'] )
df_restricted_r[,'Groupclim'] = as.factor(df_restricted_r[,'Groupclim'] )
# Variant with the aridity index.
mm1 = lmer(P50 ~ Treatment + Aridity_Index + (1|site), data = df_restricted_r)
summary(mm1)
%%R
# intraclass correlation coefficient
# NOTE(review): the constants below are hard-coded from a previous model
# summary (variance components, ICC, design effect) -- recompute if data change.
0.02207/(0.02207+0.08448)
# Design effect
1 + (10-1)*0.2071328
# Neffective
(12*10) / 2.864195
%%R
anova(mm1, type='II')
%%R
AIC(mm1)
%%R
# Residual diagnostics: normality and homoscedasticity.
hist(residuals(mm1))
qqnorm(residuals(mm1))
qqline(residuals(mm1))
plot(fitted(mm1)~residuals(mm1))
%%R
plot(residuals(mm1)~as.factor(df_restricted_r[,'Treatment']))
plot(residuals(mm1)~as.factor(df_restricted_r[,'Groupclim']))
%%R
plot(residuals(mm1)~df_restricted_r[,'acp_0'])
plot(residuals(mm1)~df_restricted_r[,'acp_1'])
plot(residuals(mm1, type = 'pearson')~df_restricted_r[,'Aridity_Index'])
%%R
# Model comparison by AIC: no random effect (gls) vs random site intercept.
M0 = gls(P50 ~ Treatment + Aridity_Index + acp_1, method = "REML", data = df_restricted_r)
M1 = lme(P50 ~ Treatment + acp_1, random = ~ 1|site, method = "REML", data = df_restricted_r)
M2 = lme(P50 ~ Treatment + Aridity_Index, random = ~ 1|site, method = "REML", data = df_restricted_r)
AIC(M0,M1, M2)
There is a need for a random site effect
The main results are :
# https://jakevdp.github.io/PythonDataScienceHandbook/04.13-geographic-data-with-basemap.html
import os
# Point PROJ at the conda-installed projection data Basemap needs.
os.environ['PROJ_LIB'] = r'/home/xavier/anaconda3/pkgs/proj4-5.2.0-he6710b0_1/share/proj'
from mpl_toolkits.basemap import Basemap
# cities = pd.read_csv('data/california_cities.csv')
# Extract the data we're interested in
lat = df_mean_pp.y_mean.values # cities['latd'].values
lon = df_mean_pp.x_mean.values
# Variable names kept from the handbook example: colour = P50, size = -P50.
population = df_mean_pp.P50_mean.values*1
area = df_mean_pp.P50_mean.values*-1
population
# 1. Draw the map background
fig = plt.figure(figsize=(12, 12))
m = Basemap(projection='lcc', resolution='h',
            lat_0=df_mean_pp.y_mean.min(), lon_0=df_mean_pp.x_mean.min(),
            width=2.2E6, height=1.4E6)
m.shadedrelief()
m.drawcoastlines(color='gray')
m.drawcountries(color='gray')
m.drawstates(color='gray')
# 2. scatter city data, with color reflecting population
# and size reflecting area
m.scatter(lon, lat, latlon=True,
          c=population,
          s=area,
          cmap='Reds',
          alpha=0.5)
# 3. create colorbar and legend
# NOTE(review): the colourbar label still says population (copied from the
# handbook example) although the colours encode P50 -- update the label.
plt.colorbar(label=r'$\log_{10}({\rm population})$')
plt.clim(-4.2, -3.5)
# make legend with dummy points
for a in [100, 300, 500]:
    plt.scatter([], [], c='k', alpha=0.5, s=a,
                label=str(a) + ' km$^2$')
plt.legend(scatterpoints=1, frameon=False,
           labelspacing=1, loc='lower left');
# https://towardsdatascience.com/reading-and-visualizing-geotiff-images-with-python-8dcca7a74510
import rasterio
from rasterio.plot import show
# Global aridity raster used as a background layer.
strmap = '/home/xavier/Downloads/7504448/global-ai_et0/ai_et0/ai_et0.tif'
dataset = rasterio.open(strmap)
show(dataset)
df_mean_pp.y_mean.tolist()
# NOTE(review): lat is collapsed to a single min value here while lon stays a
# vector -- looks like a leftover; verify before reuse.
lat = df_mean_pp_T.y_mean.values.min()# cities['latd'].values
lon = df_mean_pp.x_mean.values
population = df_mean_pp.P50_mean.values*1
area = df_mean_pp.P50_mean.values*-1
Basemap()
# https://stackoverflow.com/questions/55854988/subplots-onto-a-basemap
from mpl_toolkits.basemap import Basemap
from mpl_toolkits.axes_grid1.inset_locator import inset_axes
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.patches as mpatches
# prep values for map extents and more (bounding box around all sites)
llcrnrlat = df_mean_pp.y_mean.values.min()-0.5
llcrnrlon = df_mean_pp.x_mean.values.min()-0.5
urcrnrlat = df_mean_pp.y_mean.values.max()+0.5
urcrnrlon = df_mean_pp.x_mean.values.max()+0.8
mid_lon = (urcrnrlon+llcrnrlon)/2.0
hr_lon = (urcrnrlon-llcrnrlon)/2.0
mid_lat = (urcrnrlat+llcrnrlat)/2.0
hr_lat = (urcrnrlat-llcrnrlat)/2.0
def build_bar(mapx, mapy, ax, width, xvals=('a','b'), yvals=(1,4), fcolors=('r','b')):
    '''
    Draw a tiny bar chart as an inset axes anchored at map data coordinates.

    mapx, mapy : anchor position in the parent axes' data coordinates
    ax : parent (map) axes
    width : inset width and height, in inches
    xvals : bar labels; yvals : bar heights; fcolors : bar face colours
        (the three sequences are zipped together, so extra elements are ignored)
    Returns the created inset axes.
    '''
    # BUGFIX: the default arguments were mutable lists; tuples avoid the
    # shared-mutable-default pitfall while staying backward compatible.
    ax_h = inset_axes(ax, width=width,
                      height=width,
                      loc=3,
                      bbox_to_anchor=(mapx, mapy),
                      bbox_transform=ax.transData,
                      borderpad=0,
                      axes_kwargs={'alpha': 0.35, 'visible': True})
    for x, y, c in zip(xvals, yvals, fcolors):
        ax_h.bar(x, y, label=str(x), fc=c)
    #ax.xticks(range(len(xvals)), xvals, fontsize=10, rotation=30)
    ax_h.axis('off')  # hide ticks/spines: the inset is a pure glyph
    return ax_h
fig, ax = plt.subplots(figsize=(12, 12)) # bigger is better
# Lambert conformal map covering all P. pinaster sites.
bm = Basemap(llcrnrlat= llcrnrlat,
             llcrnrlon= llcrnrlon,
             urcrnrlat= urcrnrlat,
             urcrnrlon= urcrnrlon,
             ax = ax,
             resolution='h',
             projection='lcc',
             lon_0=mid_lon,
             lat_0=mid_lat)
# bm.fillcontinents(color='gray', zorder=0)
# bm.drawcoastlines(color='gray', linewidth=0.3, zorder=2)
bm.shadedrelief()
bm.drawcoastlines(color='gray')
bm.drawcountries(color='gray')
bm.drawstates(color='gray')
plt.title('site_scores', fontsize=20)
# ======================
# make-up some locations
# ----------------------
# Jitter the site coordinates so overlapping insets separate visually.
lon1s = df_mean_pp.x_mean + np.random.normal(loc=0,scale=0.1, size = len(df_mean_pp.x_mean))
lat1s = df_mean_pp.y_mean + np.random.normal(loc=0,scale=0.1, size = len(df_mean_pp.x_mean))
# Pair up the per-site P50 means as 2-value bar data.
# assumes df_mean_pp_T rows come in adult/young pairs per site -- TODO confirm
bar_data = np.array([[item] for i, item in enumerate(df_mean_pp_T.P50_mean)]).reshape(-1,2).tolist() # list of 3 items lists
# create a barchart at each location in (lon1s,lat1s)
# ---------------------------------------------------
bar_width = 0.1 # inch
colors = ['r','b']
for ix, lon1, lat1 in zip(list(range(len(lon1s))), lon1s, lat1s):
    x1, y1 = bm(lon1, lat1) # get data coordinates for plotting
    bax = build_bar(x1, y1, ax, 0.2, xvals=['a','b'], \
                    yvals=bar_data[ix], \
                    fcolors=colors)
# create legend (of the 3 classes)
patch0 = mpatches.Patch(color=colors[0], label='P50 adult')
patch1 = mpatches.Patch(color=colors[1], label='P50 young')
ax.legend(handles=[patch0,patch1], loc=1)
plt.show()
# https://stackoverflow.com/questions/45677300/how-to-plot-geotiff-data-in-specific-area-lat-lon-with-python
import georaster
import georaster
fig = plt.figure(figsize=(8,8))
# full path to the geotiff file
# NOTE(review): fpath is an unrelated Windows example path, while the extent
# below is read from strmap -- the later SingleBandRaster(fpath, ...) call
# likely fails; verify which raster is intended.
fpath = r"C:\\path_to_your\geotiff_file\srtm_57_10.tif" # Thailand east
# read extent of image without loading
# good for values in degrees lat/long
# geotiff may use other coordinates and projection
my_image = georaster.SingleBandRaster(strmap, load_data=False)
# grab limits of image's extent
minx, maxx, miny, maxy = my_image.extent
# set Basemap with slightly larger extents
# set resolution at intermediate level "i"
m = Basemap( projection='cyl', \
             llcrnrlon=minx-2, \
             llcrnrlat=miny-2, \
             urcrnrlon=maxx+2, \
             urcrnrlat=maxy+2, \
             resolution='i')
m.drawcoastlines(color="gray")
m.fillcontinents(color='beige')
# load the geotiff image, assign it a variable
image = georaster.SingleBandRaster( fpath, \
                                    load_data=(minx, maxx, miny, maxy), \
                                    latlon=True)
# plot the image on matplotlib active axes
# set zorder to put the image on top of coastlines and continent areas
# set alpha to let the hidden graphics show through
plt.imshow(image.r, extent=(minx, maxx, miny, maxy), zorder=10, alpha=0.6)
plt.show()
# NOTE(review): the block below repeats the earlier map-extent preparation.
lat = df_mean_pp_T.y_mean.values.min()# cities['latd'].values
lon = df_mean_pp.x_mean.values
population = df_mean_pp.P50_mean.values*1
area = df_mean_pp.P50_mean.values*-1
# https://stackoverflow.com/questions/55854988/subplots-onto-a-basemap
from mpl_toolkits.basemap import Basemap
from mpl_toolkits.axes_grid1.inset_locator import inset_axes
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.patches as mpatches
# prep values for map extents and more
llcrnrlat = df_mean_pp.y_mean.values.min()-0.5
llcrnrlon = df_mean_pp.x_mean.values.min()-0.5
urcrnrlat = df_mean_pp.y_mean.values.max()+0.5
urcrnrlon = df_mean_pp.x_mean.values.max()+0.8
mid_lon = (urcrnrlon+llcrnrlon)/2.0
hr_lon = (urcrnrlon-llcrnrlon)/2.0
mid_lat = (urcrnrlat+llcrnrlat)/2.0
hr_lat = (urcrnrlat-llcrnrlat)/2.0
# function to create inset axes and plot bar chart on it
# this is good for 3 items bar chart
def build_bar(mapx, mapy, ax, width, xvals=('a','b'), yvals=(1,4), fcolors=('r','b')):
    '''
    Draw a tiny bar chart as an inset axes anchored at map data coordinates.
    (Duplicate definition of the helper defined earlier in the notebook.)

    mapx, mapy : anchor position in the parent axes' data coordinates
    ax : parent (map) axes
    width : inset width and height, in inches
    xvals : bar labels; yvals : bar heights; fcolors : bar face colours
    Returns the created inset axes.
    '''
    # BUGFIX: the default arguments were mutable lists; tuples avoid the
    # shared-mutable-default pitfall while staying backward compatible.
    ax_h = inset_axes(ax, width=width,
                      height=width,
                      loc=3,
                      bbox_to_anchor=(mapx, mapy),
                      bbox_transform=ax.transData,
                      borderpad=0,
                      axes_kwargs={'alpha': 0.35, 'visible': True})
    for x, y, c in zip(xvals, yvals, fcolors):
        ax_h.bar(x, y, label=str(x), fc=c)
    #ax.xticks(range(len(xvals)), xvals, fontsize=10, rotation=30)
    ax_h.axis('off')  # hide ticks/spines: the inset is a pure glyph
    return ax_h
fig, ax = plt.subplots(figsize=(12, 12)) # bigger is better
# Second map: one bar per site showing the adult - young P50 difference.
bm = Basemap(llcrnrlat= llcrnrlat,
             llcrnrlon= llcrnrlon,
             urcrnrlat= urcrnrlat,
             urcrnrlon= urcrnrlon,
             ax = ax,
             resolution='h',
             projection='lcc',
             lon_0=mid_lon,
             lat_0=mid_lat)
# bm.fillcontinents(color='gray', zorder=0)
# bm.drawcoastlines(color='gray', linewidth=0.3, zorder=2)
bm.shadedrelief()
bm.drawcoastlines(color='gray')
bm.drawcountries(color='gray')
bm.drawstates(color='gray')
plt.title('site_scores', fontsize=20)
# ======================
# make-up some locations
# ----------------------
# Unjittered site coordinates this time.
lon1s = df_mean_pp.x_mean.tolist()
lat1s = df_mean_pp.y_mean.tolist()
# Difference of each pair from bar_data.
# NOTE(review): depends on `bar_data` defined in the earlier map cell.
bd = [[bd[0]-bd[1]] for bd in bar_data] # list of 3 items lists
# create a barchart at each location in (lon1s,lat1s)
# ---------------------------------------------------
bar_width = 0.1 # inch
colors = ['r']
for ix, lon1, lat1 in zip(list(range(len(lon1s))), lon1s, lat1s):
    x1, y1 = bm(lon1, lat1) # get data coordinates for plotting
    bax = build_bar(x1, y1, ax, 0.2, xvals=['a'], \
                    yvals=bd[ix], \
                    fcolors=colors)
# create legend (of the 3 classes)
patch0 = mpatches.Patch(color=colors[0], label='P50 adult')
# NOTE(review): patch1 comes from the previous map cell; only patch0 is
# defined here -- verify the intended legend entries.
ax.legend(handles=[patch0,patch1], loc=1)
plt.show()